1 /*
2 * Copyright (C) 2009 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package com.google.common.base;
18
19 import static com.google.common.base.Preconditions.checkArgument;
20 import static com.google.common.base.Preconditions.checkNotNull;
21
22 import com.google.common.annotations.Beta;
23 import com.google.common.annotations.GwtCompatible;
24
25 import java.util.ArrayList;
26 import java.util.Collections;
27 import java.util.Iterator;
28 import java.util.LinkedHashMap;
29 import java.util.List;
30 import java.util.Map;
31
32 import javax.annotation.CheckReturnValue;
33
34 /**
35 * Extracts non-overlapping substrings from an input string, typically by
36 * recognizing appearances of a <i>separator</i> sequence. This separator can be
37 * specified as a single {@linkplain #on(char) character}, fixed {@linkplain
38 * #on(String) string}, {@linkplain #onPattern regular expression} or {@link
39 * #on(CharMatcher) CharMatcher} instance. Or, instead of using a separator at
40 * all, a splitter can extract adjacent substrings of a given {@linkplain
41 * #fixedLength fixed length}.
42 *
43 * <p>For example, this expression: <pre> {@code
44 *
45 * Splitter.on(',').split("foo,bar,qux")}</pre>
46 *
47 * ... produces an {@code Iterable} containing {@code "foo"}, {@code "bar"} and
48 * {@code "qux"}, in that order.
49 *
50 * <p>By default, {@code Splitter}'s behavior is simplistic and unassuming. The
51 * following expression: <pre> {@code
52 *
53 * Splitter.on(',').split(" foo,,, bar ,")}</pre>
54 *
55 * ... yields the substrings {@code [" foo", "", "", " bar ", ""]}. If this
56 * is not the desired behavior, use configuration methods to obtain a <i>new</i>
57 * splitter instance with modified behavior: <pre> {@code
58 *
59 * private static final Splitter MY_SPLITTER = Splitter.on(',')
60 * .trimResults()
61 * .omitEmptyStrings();}</pre>
62 *
63 * <p>Now {@code MY_SPLITTER.split("foo,,, bar ,")} returns just {@code ["foo",
64 * "bar"]}. Note that the order in which these configuration methods are called
65 * is never significant.
66 *
67 * <p><b>Warning:</b> Splitter instances are immutable. Invoking a configuration
68 * method has no effect on the receiving instance; you must store and use the
69 * new splitter instance it returns instead. <pre> {@code
70 *
71 * // Do NOT do this
72 * Splitter splitter = Splitter.on('/');
73 * splitter.trimResults(); // does nothing!
74 * return splitter.split("wrong / wrong / wrong");}</pre>
75 *
76 * <p>For separator-based splitters that do not use {@code omitEmptyStrings}, an
77 * input string containing {@code n} occurrences of the separator naturally
78 * yields an iterable of size {@code n + 1}. So if the separator does not occur
79 * anywhere in the input, a single substring is returned containing the entire
80 * input. Consequently, all splitters split the empty string to {@code [""]}
81 * (note: even fixed-length splitters).
82 *
 * <p>Splitter instances are thread-safe and immutable, and are therefore safe
 * to store as {@code static final} constants.
85 *
86 * <p>The {@link Joiner} class provides the inverse operation to splitting, but
87 * note that a round-trip between the two should be assumed to be lossy.
88 *
89 * <p>See the Guava User Guide article on <a href=
90 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter">
91 * {@code Splitter}</a>.
92 *
93 * @author Julien Silland
94 * @author Jesse Wilson
95 * @author Kevin Bourrillion
96 * @author Louis Wasserman
97 * @since 1.0
98 */
99 @GwtCompatible(emulated = true)
100 public final class Splitter {
/** Produces the separator-finding iterator for each input to split. */
private final Strategy strategy;
/** Whether pieces that are empty (after trimming) are left out of the results. */
private final boolean omitEmptyStrings;
/** Matches the characters stripped from both ends of every piece. */
private final CharMatcher trimmer;
/** Maximum number of pieces a single split operation may produce. */
private final int limit;

/**
 * Creates a splitter with default settings: empty strings are kept,
 * {@code CharMatcher.NONE} is used as the trimmer, and the limit is
 * {@code Integer.MAX_VALUE}.
 */
private Splitter(Strategy strategy) {
  this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
}

/**
 * Creates a fully configured splitter. The configuration methods use this
 * constructor to derive a new instance that differs from an existing one in
 * a single setting.
 */
private Splitter(Strategy strategy, boolean omitEmptyStrings,
    CharMatcher trimmer, int limit) {
  this.limit = limit;
  this.trimmer = trimmer;
  this.omitEmptyStrings = omitEmptyStrings;
  this.strategy = strategy;
}
117
118 /**
119 * Returns a splitter that uses the given single-character separator. For
120 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
121 * containing {@code ["foo", "", "bar"]}.
122 *
123 * @param separator the character to recognize as a separator
124 * @return a splitter, with default settings, that recognizes that separator
125 */
126 public static Splitter on(char separator) {
127 return on(CharMatcher.is(separator));
128 }
129
130 /**
131 * Returns a splitter that considers any single character matched by the
132 * given {@code CharMatcher} to be a separator. For example, {@code
133 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
134 * iterable containing {@code ["foo", "", "bar", "quux"]}.
135 *
136 * @param separatorMatcher a {@link CharMatcher} that determines whether a
137 * character is a separator
138 * @return a splitter, with default settings, that uses this matcher
139 */
140 public static Splitter on(final CharMatcher separatorMatcher) {
141 checkNotNull(separatorMatcher);
142
143 return new Splitter(new Strategy() {
144 @Override public SplittingIterator iterator(
145 Splitter splitter, final CharSequence toSplit) {
146 return new SplittingIterator(splitter, toSplit) {
147 @Override int separatorStart(int start) {
148 return separatorMatcher.indexIn(toSplit, start);
149 }
150
151 @Override int separatorEnd(int separatorPosition) {
152 return separatorPosition + 1;
153 }
154 };
155 }
156 });
157 }
158
159 /**
160 * Returns a splitter that uses the given fixed string as a separator. For
161 * example, {@code Splitter.on(", ").split("foo, bar,baz")} returns an
162 * iterable containing {@code ["foo", "bar,baz"]}.
163 *
164 * @param separator the literal, nonempty string to recognize as a separator
165 * @return a splitter, with default settings, that recognizes that separator
166 */
167 public static Splitter on(final String separator) {
168 checkArgument(separator.length() != 0,
169 "The separator may not be the empty string.");
170
171 return new Splitter(new Strategy() {
172 @Override public SplittingIterator iterator(
173 Splitter splitter, CharSequence toSplit) {
174 return new SplittingIterator(splitter, toSplit) {
175 @Override public int separatorStart(int start) {
176 int separatorLength = separator.length();
177
178 positions:
179 for (int p = start, last = toSplit.length() - separatorLength;
180 p <= last; p++) {
181 for (int i = 0; i < separatorLength; i++) {
182 if (toSplit.charAt(i + p) != separator.charAt(i)) {
183 continue positions;
184 }
185 }
186 return p;
187 }
188 return -1;
189 }
190
191 @Override public int separatorEnd(int separatorPosition) {
192 return separatorPosition + separator.length();
193 }
194 };
195 }
196 });
197 }
198
199 /**
200 * Returns a splitter that divides strings into pieces of the given length.
201 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
202 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
203 * smaller than {@code length} but will never be empty.
204 *
205 * <p><b>Exception:</b> for consistency with separator-based splitters, {@code
206 * split("")} does not yield an empty iterable, but an iterable containing
207 * {@code ""}. This is the only case in which {@code
208 * Iterables.size(split(input))} does not equal {@code
209 * IntMath.divide(input.length(), length, CEILING)}. To avoid this behavior,
210 * use {@code omitEmptyStrings}.
211 *
212 * @param length the desired length of pieces after splitting, a positive
213 * integer
214 * @return a splitter, with default settings, that can split into fixed sized
215 * pieces
216 * @throws IllegalArgumentException if {@code length} is zero or negative
217 */
218 public static Splitter fixedLength(final int length) {
219 checkArgument(length > 0, "The length may not be less than 1");
220
221 return new Splitter(new Strategy() {
222 @Override public SplittingIterator iterator(
223 final Splitter splitter, CharSequence toSplit) {
224 return new SplittingIterator(splitter, toSplit) {
225 @Override public int separatorStart(int start) {
226 int nextChunkStart = start + length;
227 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
228 }
229
230 @Override public int separatorEnd(int separatorPosition) {
231 return separatorPosition;
232 }
233 };
234 }
235 });
236 }
237
238 /**
239 * Returns a splitter that behaves equivalently to {@code this} splitter, but
240 * automatically omits empty strings from the results. For example, {@code
241 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
242 * iterable containing only {@code ["a", "b", "c"]}.
243 *
244 * <p>If either {@code trimResults} option is also specified when creating a
245 * splitter, that splitter always trims results first before checking for
246 * emptiness. So, for example, {@code
247 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
248 * an empty iterable.
249 *
250 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
251 * to return an empty iterable, but when using this option, it can (if the
252 * input sequence consists of nothing but separators).
253 *
254 * @return a splitter with the desired configuration
255 */
256 @CheckReturnValue
257 public Splitter omitEmptyStrings() {
258 return new Splitter(strategy, true, trimmer, limit);
259 }
260
261 /**
262 * Returns a splitter that behaves equivalently to {@code this} splitter but
263 * stops splitting after it reaches the limit.
264 * The limit defines the maximum number of items returned by the iterator.
265 *
266 * <p>For example,
267 * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
268 * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the
269 * omitted strings do no count. Hence,
270 * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
271 * returns an iterable containing {@code ["a", "b", "c,d"}.
272 * When trim is requested, all entries, including the last are trimmed. Hence
273 * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
274 * results in @{code ["a", "b", "c , d"]}.
275 *
276 * @param limit the maximum number of items returns
277 * @return a splitter with the desired configuration
278 * @since 9.0
279 */
280 @CheckReturnValue
281 public Splitter limit(int limit) {
282 checkArgument(limit > 0, "must be greater than zero: %s", limit);
283 return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
284 }
285
286 /**
287 * Returns a splitter that behaves equivalently to {@code this} splitter, but
288 * automatically removes leading and trailing {@linkplain
289 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
290 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
291 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
292 * containing {@code ["a", "b", "c"]}.
293 *
294 * @return a splitter with the desired configuration
295 */
296 @CheckReturnValue
297 public Splitter trimResults() {
298 return trimResults(CharMatcher.WHITESPACE);
299 }
300
301 /**
302 * Returns a splitter that behaves equivalently to {@code this} splitter, but
303 * removes all leading or trailing characters matching the given {@code
304 * CharMatcher} from each returned substring. For example, {@code
305 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
306 * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
307 *
308 * @param trimmer a {@link CharMatcher} that determines whether a character
309 * should be removed from the beginning/end of a subsequence
310 * @return a splitter with the desired configuration
311 */
312 // TODO(kevinb): throw if a trimmer was already specified!
313 @CheckReturnValue
314 public Splitter trimResults(CharMatcher trimmer) {
315 checkNotNull(trimmer);
316 return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
317 }
318
319 /**
320 * Splits {@code sequence} into string components and makes them available
321 * through an {@link Iterator}, which may be lazily evaluated. If you want
322 * an eagerly computed {@link List}, use {@link #splitToList(CharSequence)}.
323 *
324 * @param sequence the sequence of characters to split
325 * @return an iteration over the segments split from the parameter.
326 */
327 public Iterable<String> split(final CharSequence sequence) {
328 checkNotNull(sequence);
329
330 return new Iterable<String>() {
331 @Override public Iterator<String> iterator() {
332 return splittingIterator(sequence);
333 }
334 @Override public String toString() {
335 return Joiner.on(", ")
336 .appendTo(new StringBuilder().append('['), this)
337 .append(']')
338 .toString();
339 }
340 };
341 }
342
343 private Iterator<String> splittingIterator(CharSequence sequence) {
344 return strategy.iterator(this, sequence);
345 }
346
347 /**
348 * Splits {@code sequence} into string components and returns them as
349 * an immutable list. If you want an {@link Iterable} which may be lazily
350 * evaluated, use {@link #split(CharSequence)}.
351 *
352 * @param sequence the sequence of characters to split
353 * @return an immutable list of the segments split from the parameter
354 * @since 15.0
355 */
356 @Beta
357 public List<String> splitToList(CharSequence sequence) {
358 checkNotNull(sequence);
359
360 Iterator<String> iterator = splittingIterator(sequence);
361 List<String> result = new ArrayList<String>();
362
363 while (iterator.hasNext()) {
364 result.add(iterator.next());
365 }
366
367 return Collections.unmodifiableList(result);
368 }
369
370 /**
371 * Returns a {@code MapSplitter} which splits entries based on this splitter,
372 * and splits entries into keys and values using the specified separator.
373 *
374 * @since 10.0
375 */
376 @CheckReturnValue
377 @Beta
378 public MapSplitter withKeyValueSeparator(String separator) {
379 return withKeyValueSeparator(on(separator));
380 }
381
382 /**
383 * Returns a {@code MapSplitter} which splits entries based on this splitter,
384 * and splits entries into keys and values using the specified separator.
385 *
386 * @since 14.0
387 */
388 @CheckReturnValue
389 @Beta
390 public MapSplitter withKeyValueSeparator(char separator) {
391 return withKeyValueSeparator(on(separator));
392 }
393
394 /**
395 * Returns a {@code MapSplitter} which splits entries based on this splitter,
396 * and splits entries into keys and values using the specified key-value
397 * splitter.
398 *
399 * @since 10.0
400 */
401 @CheckReturnValue
402 @Beta
403 public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
404 return new MapSplitter(this, keyValueSplitter);
405 }
406
407 /**
408 * An object that splits strings into maps as {@code Splitter} splits
409 * iterables and lists. Like {@code Splitter}, it is thread-safe and
410 * immutable.
411 *
412 * @since 10.0
413 */
414 @Beta
415 public static final class MapSplitter {
416 private static final String INVALID_ENTRY_MESSAGE =
417 "Chunk [%s] is not a valid entry";
418 private final Splitter outerSplitter;
419 private final Splitter entrySplitter;
420
421 private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
422 this.outerSplitter = outerSplitter; // only "this" is passed
423 this.entrySplitter = checkNotNull(entrySplitter);
424 }
425
426 /**
427 * Splits {@code sequence} into substrings, splits each substring into
428 * an entry, and returns an unmodifiable map with each of the entries. For
429 * example, <code>
430 * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
431 * .split("a=>b ; c=>b")
432 * </code> will return a mapping from {@code "a"} to {@code "b"} and
433 * {@code "c"} to {@code b}.
434 *
435 * <p>The returned map preserves the order of the entries from
436 * {@code sequence}.
437 *
438 * @throws IllegalArgumentException if the specified sequence does not split
439 * into valid map entries, or if there are duplicate keys
440 */
441 public Map<String, String> split(CharSequence sequence) {
442 Map<String, String> map = new LinkedHashMap<String, String>();
443 for (String entry : outerSplitter.split(sequence)) {
444 Iterator<String> entryFields = entrySplitter.splittingIterator(entry);
445
446 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
447 String key = entryFields.next();
448 checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
449
450 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
451 String value = entryFields.next();
452 map.put(key, value);
453
454 checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
455 }
456 return Collections.unmodifiableMap(map);
457 }
458 }
459
460 private interface Strategy {
461 Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
462 }
463
464 private abstract static class SplittingIterator extends AbstractIterator<String> {
465 final CharSequence toSplit;
466 final CharMatcher trimmer;
467 final boolean omitEmptyStrings;
468
469 /**
470 * Returns the first index in {@code toSplit} at or after {@code start}
471 * that contains the separator.
472 */
473 abstract int separatorStart(int start);
474
475 /**
476 * Returns the first index in {@code toSplit} after {@code
477 * separatorPosition} that does not contain a separator. This method is only
478 * invoked after a call to {@code separatorStart}.
479 */
480 abstract int separatorEnd(int separatorPosition);
481
482 int offset = 0;
483 int limit;
484
485 protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
486 this.trimmer = splitter.trimmer;
487 this.omitEmptyStrings = splitter.omitEmptyStrings;
488 this.limit = splitter.limit;
489 this.toSplit = toSplit;
490 }
491
492 @Override protected String computeNext() {
493 /*
494 * The returned string will be from the end of the last match to the
495 * beginning of the next one. nextStart is the start position of the
496 * returned substring, while offset is the place to start looking for a
497 * separator.
498 */
499 int nextStart = offset;
500 while (offset != -1) {
501 int start = nextStart;
502 int end;
503
504 int separatorPosition = separatorStart(offset);
505 if (separatorPosition == -1) {
506 end = toSplit.length();
507 offset = -1;
508 } else {
509 end = separatorPosition;
510 offset = separatorEnd(separatorPosition);
511 }
512 if (offset == nextStart) {
513 /*
514 * This occurs when some pattern has an empty match, even if it
515 * doesn't match the empty string -- for example, if it requires
516 * lookahead or the like. The offset must be increased to look for
517 * separators beyond this point, without changing the start position
518 * of the next returned substring -- so nextStart stays the same.
519 */
520 offset++;
521 if (offset >= toSplit.length()) {
522 offset = -1;
523 }
524 continue;
525 }
526
527 while (start < end && trimmer.matches(toSplit.charAt(start))) {
528 start++;
529 }
530 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
531 end--;
532 }
533
534 if (omitEmptyStrings && start == end) {
535 // Don't include the (unused) separator in next split string.
536 nextStart = offset;
537 continue;
538 }
539
540 if (limit == 1) {
541 // The limit has been reached, return the rest of the string as the
542 // final item. This is tested after empty string removal so that
543 // empty strings do not count towards the limit.
544 end = toSplit.length();
545 offset = -1;
546 // Since we may have changed the end, we need to trim it again.
547 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
548 end--;
549 }
550 } else {
551 limit--;
552 }
553
554 return toSplit.subSequence(start, end).toString();
555 }
556 return endOfData();
557 }
558 }
559 }
560